Setup

load libraries and define functions

library(DESeq2) 
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(limma)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
## 
##     plotMA
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
library(EnhancedVolcano)
## Loading required package: ggplot2
## Loading required package: ggrepel
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
library(AnnotationDbi)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(msigdbr)
library(clusterProfiler)
## 
## clusterProfiler v4.2.2  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
## 
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
library(scales)
library(ggpubr)
library(ggplot2)
library(enrichplot)
## 
## Attaching package: 'enrichplot'
## The following object is masked from 'package:ggpubr':
## 
##     color_palette
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(pheatmap)
library(org.Hs.eg.db)
## 
library(xCell)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:AnnotationDbi':
## 
##     select
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# function to read experiment designs from our datasets
read_design <- function(filename) {
  E <- read.csv(filename, sep = "\t")

  rownames(E) <- E$Run
  
  E <- E[c("Sample.Characteristic.disease.", "Sample.Characteristic.organism.part.")]
  colnames(E) <- c("Diagnosis", "Brodmann")

  E[E == "bipolar disorder"] = "bipolar"
  E$`Brodmann` <- sub("Brodmann \\(1909\\) area ", "", E$`Brodmann`)
  
  E[E == "dorsolateral prefrontal cortex"] = "46"
  
  return(E)
}

Load Data

# read the experiments' designs
E7 <- read_design("../Databases/Bipolar - E-GEOD-78936/E-GEOD-78936-experiment-design.tsv")
E5 <- read_design("../Databases/Bipolar - E-GEOD-53239/E-GEOD-53239-experiment-design.tsv")

# add identifiers for each dataset
E5 <- cbind(rep("GEOD-53239", times=nrow(E5)), E5)
E7 <- cbind(rep("GEOD-78936", times=nrow(E7)), E7)

colnames(E5)[1] = colnames(E7)[1] = "Dataset"

# combine metadata
metadata <- rbind(E5, E7)

# read counts data
data1 <- read.delim("../Databases/Bipolar - E-GEOD-53239/E-GEOD-53239-raw-counts.tsv")
data2 <- read.delim("../Databases/Bipolar - E-GEOD-78936/E-GEOD-78936-raw-counts.tsv")

# combine only the counts data
counts <- cbind(data1[-1][-1], data2[-1][-1])
counts <- counts[sort(colnames(counts))]

# take the gene names from the counts data
genes.symbols <- data1[,2]

# if there is no gene name, use the ID
genes.symbols[genes.symbols == ""] <- data1$Gene.ID[genes.symbols == ""]

# replace dupliactes with ID too
genes.symbols[duplicated(genes.symbols)] <- data1$Gene.ID[duplicated(genes.symbols)]

Identifying Biomarker Genes

Run DESeq2

# run DESeq2 on combined data from both experiments
# using Dataset to distinguish between them which will remove the batch effect. 
# Since Dataset is a division contained in Brodmann, using Brodmann is enough
dds <- DESeqDataSetFromMatrix(countData=counts, colData=metadata, design= ~Brodmann + Diagnosis)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 18 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
rownames(dds) <- genes.symbols

# run different DESeq2 for each brain area
areas <- c(9, 11, 24, 46)
areaDds <- list()

for (i in seq_along(areas)){
  areaDds[[i]] <- DESeqDataSetFromMatrix(countData=counts[,metadata$Brodmann == areas[i]], colData=metadata[metadata$Brodmann == areas[i],], design= ~Diagnosis)
  areaDds[[i]] <- DESeq(areaDds[[i]])
}
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 92 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 45 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 15 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 54 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing

Analyze the data:

# do logfold change for Diagnosis
# normal vs bipolar
resLFC.bipolar.normal <- lfcShrink(dds, contrast = c("Diagnosis", "bipolar", "normal"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
resLFC.bipolar.normal["symbol"] <- genes.symbols
resOrdered.bipolar.normal <- resLFC.bipolar.normal[order(resLFC.bipolar.normal$pvalue),]

# bipolar vs schizophrenia
resLFC.bipolar.schizo <- lfcShrink(dds, contrast = c("Diagnosis", "bipolar", "schizophrenia"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
resLFC.bipolar.schizo["symbol"] <- genes.symbols
resOrdered.bipolar.schizo <- resLFC.bipolar.schizo[order(resLFC.bipolar.schizo$pvalue),]

# differentiate brain areas
resLFC.areas <- list()
resOrdered.areas <- list()
comparisons <- list()

j <- 1

for (i in seq_along(areas)){
  controls = c("normal", "schizophrenia")
  if (areas[i] == 46){
    # area 46 has no SZ patients
    controls = controls[1]
  }
  
  for (control in controls){
    comparison <- paste0("bipolar vs ", control, " in ", areas[i])
    print(paste0("Calculating ", comparison))
    resLFC.areas[[j]] <- lfcShrink(areaDds[[i]], contrast = c("Diagnosis", "bipolar", control), type = "ashr")
    resLFC.areas[[j]]["symbol"] <- genes.symbols
    resOrdered.areas[[j]] <- resLFC.areas[[j]][order(resLFC.areas[[j]]$pvalue),]
    comparisons[[j]] <- comparison
    j = j + 1
  }
}
## [1] "Calculating bipolar vs normal in 9"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs schizophrenia in 9"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs normal in 11"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs schizophrenia in 11"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs normal in 24"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs schizophrenia in 24"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs normal in 46"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
# normalize our data
dds.norm <- vst(dds, blind=F)
rownames(dds.norm) <- genes.symbols

# remove batch effect for PCA
mat <- assay(dds.norm)
mat <- removeBatchEffect(mat, dds.norm$Dataset)
assay(dds.norm) <- mat

Make some nice looking graphs :)

# create volcano plots
EnhancedVolcano(resOrdered.bipolar.normal, lab = resOrdered.bipolar.normal$symbol, x = 'log2FoldChange', y = 'padj', FCcutoff=2, pCutoff = 10e-10, title="bipolar vs normal")

EnhancedVolcano(resOrdered.bipolar.schizo, lab = resOrdered.bipolar.schizo$symbol, x = 'log2FoldChange', y = 'padj', FCcutoff=2, pCutoff = 10e-10, title="bipolar vs schizophrenia")

for (i in seq_along(comparisons)){
  print(EnhancedVolcano(resOrdered.areas[[i]], lab = resOrdered.areas[[i]]$symbol, x = 'log2FoldChange', y = 'padj', FCcutoff=2, pCutoff = 10e-10, title=comparisons[i]))
}
## Warning: Removed 2 rows containing missing values (geom_vline).
## Warning: Removed 1 rows containing missing values (geom_hline).

## Warning: Removed 2 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).

## Warning: Removed 2 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).

## Warning: Removed 2 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).

## Warning: Removed 1 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).

## Warning: Removed 1 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).

# make some pca plots
plotPCA(dds.norm, intgroup=c("Diagnosis"))

plotPCA(dds.norm, intgroup=c("Brodmann"))

plotPCA(dds.norm, intgroup=c("Diagnosis", "Brodmann"))

Some count plots for significant genes :)

# significant genes to check in violin plots
genes <- c("LINC02340", "MTND6P4", "MT1X", "IL1RL1")

# compare the means of each two groups
means_comp <- list(c("bipolar", "normal"), c("bipolar", "schizophrenia"), c("normal", "schizophrenia"))

# make the violin plots
for (gene in genes) {
  gene.index <- which(genes.symbols %in% gene)
  plot <- plotCounts(dds, gene = rownames(dds)[gene.index], intgroup = c('Diagnosis'), xlab="Diagnosis", title=paste0(gene, " counts"), returnData = T)
  print(
    ggplot(plot, aes(x = Diagnosis, y = count, fill=Diagnosis)) + 
      geom_violin(draw_quantiles = 0.5) + 
      geom_jitter(color="royalblue", size=0.6, alpha=0.5) + 
      theme(legend.position="none")  + 
      ggtitle(paste0("Gene Expression of ", gene)) +
      scale_y_continuous(trans = log10_trans()) + 
      stat_compare_means(label = "p.signif", comparisons = means_comp,  method = "wilcox.test")
    )
}

# significant genes in single areas to check in violin plots
areaSpecific <- data.frame(GeneName = c("IGKC", "CHI3L2", "MT1X", "MTND6P4"), Brodmann = c(46, 46, 46, 46))

# make the violin plots
for (geneIndex in 1:nrow(areaSpecific)) {
  area <- areaSpecific[geneIndex,]$Brodmann
  gene <- areaSpecific[geneIndex,]$GeneName
  thisDds = areaDds[[which(areas == area)]]
  rownames(thisDds) <- genes.symbols
  plot <- plotCounts(thisDds, gene = gene, intgroup = c('Diagnosis'), xlab="Diagnosis", title=paste0(gene, " counts"), returnData = T)
  
  comp = means_comp
  if (area == 46){
    # area 46 has no SZ patients
    comp = means_comp[1]
  }
  
  print(
    ggplot(plot, aes(x = Diagnosis, y = count, fill=Diagnosis)) + 
      scale_fill_manual(values=c("#f8766d", "#00ba38", "#619cff")) +
      geom_violin(draw_quantiles = 0.5) + 
      geom_jitter(color="royalblue", size=0.6, alpha=0.5) + 
      theme(legend.position="none")  + 
      ggtitle(paste0("Gene Expression of ", gene, " in ", area)) + 
      scale_y_continuous(trans = log10_trans()) + 
      stat_compare_means(label = "p.signif", comparisons = comp, method = "wilcox.test")
    )
}
## Warning in wilcox.test.default(c(0.277745867479051, -0.301029995663981, : cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(c(-0.301029995663981, 0.359390413188311, : cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(c(0.517353315321231, 0.359390413188311,
## 0.314141856807374, : cannot compute exact p-value with ties

Identifying Enriched Pathways

And Pathway analysis :)

# remove NAs and create sorted genes list with LFC values and ENSMBLE ID as names
DE_genes_entrez_rank <- resOrdered.bipolar.normal[!is.na(resOrdered.bipolar.normal$padj),]
DE_genes_list <- DE_genes_entrez_rank$log2FoldChange
names(DE_genes_list) <- DE_genes_entrez_rank$symbol
DE_genes_list <- sort(DE_genes_list, decreasing=TRUE)

# collect hallmarks
hallmarks <- msigdbr(species = "Homo sapiens", category = "H") %>% dplyr::select(gs_name, gene_symbol)

# set nPerm to million so it's absolutely precise and does not change when the random seed changes
hm <- GSEA(DE_genes_list, TERM2GENE=hallmarks, nPerm=1000000)
## preparing geneSet collections...
## GSEA analysis...
## Warning in .GSEA(geneList = geneList, exponent = exponent, minGSSize =
## minGSSize, : We do not recommend using nPerm parameter incurrent and future
## releases
## Warning in fgsea(pathways = geneSets, stats = geneList, nperm = nPerm, minSize
## = minGSSize, : You are trying to run fgseaSimple. It is recommended to use
## fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument
## in the fgsea function call.
## leading edge analysis...
## done...
# create a ridgeplot
ridgeplot(hm) + labs(x = "enrichment distribution")
## Picking joint bandwidth of 0.0774

# and a dot plot
dotplot(hm, showCategory = 178, title="Pathway Analisys with GSEA, normal VS bipolar")

# We tried running the same code on resOrdered.bipolar.schizo, but it found no enriched terms

Clustering, no interesting data

# get normalized counts only for bipolar
sampleFilter <- dds$Diagnosis != "schizophrenia"
normcounts <- assay(vst(dds[,sampleFilter], blind=T))
colnames(normcounts) <- paste0(dds$Diagnosis[sampleFilter], seq(1, ncol(normcounts)))
var_per_gene <- apply(normcounts, 1, var)

# take 1000 most significant genes
selectedGenes <- names(var_per_gene[order(var_per_gene, decreasing = T)][1:20])
selectedGenes <- genes

normcounts.topGenes <- t(normcounts[selectedGenes,])

# make an elbow plot (optimal k seems to be 2 or 3)
fviz_nbclust(normcounts.topGenes, FUN = hcut, method = "wss")

dist_mat <- dist(normcounts.topGenes, method = 'euclidean')

# plot the cluster dendrogram
hclust_avg <- hclust(dist_mat, method = 'average')
plot(hclust_avg, cex = 0.6, hang = -1, xlab="", ylab ="", sub="", axes=F)
rect.hclust(hclust_avg, k = 3, border = 2:8)

Heatmap

# take normalized counts for normal and bipolar
sampleFilter <- dds$Diagnosis != "schizophrenia"
normcounts <- assay(vst(dds[,sampleFilter], blind=T))

# create labels for heatmap
df <- data.frame(row.names = colnames(dds[,sampleFilter]),
                 diagnosis = colData(dds[,sampleFilter])$Diagnosis,
                  area = colData(dds[,sampleFilter])$Brodmann)

# Take top 10 genes with the lowest p-value that express in Bipolar (log2FoldChange>0)
selectUp <- resOrdered.bipolar.normal$symbol[resOrdered.bipolar.normal$log2FoldChange>0][1:10]
# Take top 10 genes with the lowest p-value that express in Healthy (log2FoldChange<0)
selectDown <- resOrdered.bipolar.normal$symbol[resOrdered.bipolar.normal$log2FoldChange<0][1:10]

selected <- c(selectUp,selectDown)

# create the heatmap
pheatmap(normcounts[selected,], cluster_rows=TRUE,
         show_colnames = FALSE, cluster_cols=TRUE, 
         annotation_col=df, scale = 'row', cutree_cols = 2, cutree_rows = 2)

Examining the Changes in the Cellular Composition

# set rownames of counts so xCell understands what are the genes
rownames(counts) <- genes.symbols

# run the analysis
counts.xCell <- xCellAnalysis(counts, rnaseq=T)
## [1] "Num. of genes: 10535"
## Setting parallel calculations through a MulticoreParam back-end
## with workers=4 and tasks=100.
## Estimating ssGSEA scores for 489 gene sets.
## 
  |                                               
  |======================================================================| 100%
p <- pheatmap(counts.xCell, show_colnames = F, silent = T)

# filter results as most of it is not valuble information
counts.xCell.filtered <- counts.xCell[rowMeans(counts.xCell) > 0.01,]

# scale for heatmap, avarage should be 0
counts.xCell.filtered.scaled <- t(scale(t(counts.xCell.filtered)))

# clip values
counts.xCell.filtered.scaled.clipped <- counts.xCell.filtered.scaled
counts.xCell.filtered.scaled.clipped[counts.xCell.filtered.scaled > 1] <- 1
counts.xCell.filtered.scaled.clipped[counts.xCell.filtered.scaled < -1] <- -1
# clustering metadata for heatmap
df <- data.frame(row.names = colnames(dds),
                 diagnosis = colData(dds)$Diagnosis,
                 area = colData(dds)$Brodmann)

# show results
pheatmap(counts.xCell.filtered.scaled.clipped, cluster_rows=TRUE,
         show_colnames = FALSE, cluster_cols=TRUE, 
         annotation_col=df, scale = 'row', cutree_cols = 4, cutree_rows = 2)

# analyze clustering
k = 2
myclusters <- cutree(p$tree_col, k=k)

# show correlation between clusters and known factors about the patients
patients <- table(metadata$Diagnosis, myclusters)
mosaicplot(patients)

patients <- table(metadata$Brodmann, myclusters)
mosaicplot(patients)

# create every combination of clusters to see difference in the data
comps <- t(combn(1:k, 2))
comps <- split(comps, seq(nrow(comps)))

# violin plots of cell score in every cluster
for (cellType in rownames(counts.xCell.filtered)) {
  MEscore.df <- data.frame("score" = counts.xCell[cellType,],
                         "cluster" = factor(myclusters),
                         row.names = names(myclusters))
  
  print(
    ggplot(MEscore.df, aes(x=cluster, y=score, fill=cluster)) +
    geom_violin(draw_quantiles = 0.5) +
    labs(x="", y="Score") + stat_compare_means(method = "wilcox.test", comparisons = comps, label = "p.signif") +
      ggtitle(paste0("Cluster comparison of ", cellType))
  )
}

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

# see how the cell type divide the groups on a plot
chosenCells = c("Th1 cells", "Smooth muscle", "MSC")
scores.df <- data.frame("cluster" = factor(myclusters),
                         row.names = names(myclusters),
                        area=paste0(metadata$Brodmann, ":", factor(myclusters)))

# create a 
for (cell in chosenCells) {
  scores.df[cell] <- counts.xCell[cell,]
}

# create a list of every two cell type combination from choseCells
comps <- t(combn(1:length(chosenCells), 2))
comps <- split(comps, seq(nrow(comps)))

# see how well each two cell types divide the patients to the clusters
for (comp in comps){
  print(
    ggplot(scores.df, aes(x=scores.df[,chosenCells[comp[1]]], y=scores.df[,chosenCells[comp[2]]], color=cluster)) + 
      geom_point() + 
      xlab(chosenCells[comp[1]]) + 
      ylab(chosenCells[comp[2]])
  )
}


Overall, we can see that the division is quite good, and that there is some kind of difference between the clusters, but nothing that is expressed in other mediacal data we have